{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Copyright 2016-present, Facebook, Inc.\n",
    "# All rights reserved.\n",
    "\n",
    "# This source code is licensed under the license found in the\n",
    "# LICENSE-examples file in the root directory of this source tree."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluate pysparnn on 20 Newsgroups data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import time\n",
    "from scipy.sparse import csr_matrix\n",
    "from sklearn.datasets import fetch_20newsgroups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# make sure you run 'python setup.py install' first!\n",
    "import pysparnn.cluster_index as ci"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Get data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dataset = fetch_20newsgroups(subset='all', shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num docs: 18846\n",
      "Avg doc length: 283.6560012734798\n",
      "Num unique words: 386410\n"
     ]
    }
   ],
   "source": [
    "print('Num docs: {}'.format(len(dataset.data)))\n",
    "print('Avg doc length: {}'.format(np.mean([len(x.split()) for x in dataset.data])))\n",
    "words = set()\n",
    "for doc in dataset.data:\n",
    "    words.update(doc.split())\n",
    "print('Num unique words: {}'.format(len(words)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Turn documents into vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.neighbors import LSHForest, NearestNeighbors \n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "tv = TfidfVectorizer(decode_error='ignore')\n",
    "\n",
    "features = csr_matrix(tv.fit_transform(dataset.data))\n",
    "\n",
    "doc_index = np.array(range(len(dataset.data)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "test_features = features[:200]\n",
    "train_features = features[200:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18646, 173762)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_features.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn import datasets\n",
    "\n",
    "np.random.seed(5)\n",
    "\n",
    "centers = [[1, 1], [-1, -1], [1, -1]]\n",
    "iris = datasets.load_iris()\n",
    "X = iris.data\n",
    "y = iris.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "k_means = KMeans(n_clusters=int(np.sqrt(train_features.shape[0])), max_iter=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1223.7469282150269"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t0 = time.time()\n",
    "\n",
    "k_means.fit(train_features)\n",
    "\n",
    "time.time() - t0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16.347729921340942"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t0 = time.time()\n",
    "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)\n",
    "time.time() - t0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create an answer key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "knn = NearestNeighbors()\n",
    "        \n",
    "knn.fit(train_features)\n",
    "\n",
    "# get top 3 nearest neighbors for each document\n",
    "answers = knn.kneighbors(test_features, 3, return_distance=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build models to compare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "snn = ci.MultiClusterIndex(train_features, doc_index, num_indexes=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5,\n",
       "     radius=1.0, radius_cutoff_ratio=0.9, random_state=None)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lshf = LSHForest()\n",
    "        \n",
    "lshf.fit(train_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pysparnn_utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Percent of time snn returns a top 3 result: 0.66\n"
     ]
    }
   ],
   "source": [
    "t0 = time.time()\n",
    "\n",
    "results = snn.search(test_features, return_distance=False, num_indexes=1)\n",
    "\n",
    "print('Percent of time snn returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n",
    "\n",
    "snn_time = time.time() - t0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Percent of time lsh returns a top 3 result: 0.143\n"
     ]
    }
   ],
   "source": [
    "t0 = time.time()\n",
    "\n",
    "results = lshf.kneighbors(test_features, return_distance=False)\n",
    "\n",
    "print('Percent of time lsh returns a top 3 result: {}'.format(pysparnn_utils.recall(answers, results).mean()))\n",
    "\n",
    "lsh_time = time.time() - t0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.112146987324278"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# LSH is x times slower than snn\n",
    "lsh_time / snn_time"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
